# Start from an empty workspace: remove every object, including those whose
# names begin with a dot.
rm(list = ls(all.names = TRUE))
# setwd("....") set your working directory where the legislative speech texts are stored
require(quanteda)
require(readtext)
require(quanteda.seededlda)

# sessionInfo()
# quanteda_1.5.1 
# readtext_0.75
# quanteda.seededlda_0.2

# Read every speech file under texts/. The file name is split on "_" and the
# resulting parts are stored as the document variables Party and Mission.
myText <- readtext(
  "texts/*.txt",
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Party", "Mission"),
  encoding = "UTF-8"
)
# The first column (the document id produced by readtext) is used as the key
# when the metadata table is merged in, so it is renamed to Name.
names(myText)[1] <- "Name"
# Strip the literal ".txt" extension. fixed = TRUE is required: interpreted as
# a regular expression, "." matches any character, so a name containing e.g.
# "_txt" or "atxt" would be mangled as well.
myText$Name <- gsub(".txt", "", myText$Name, fixed = TRUE)

####################################################
#### clean texts, creating corpus and then DfM
####################################################

# Characters that stand in for an apostrophe in the raw files (typographic
# quotes and assorted mis-decoded code points), plus the plain apostrophe
# itself. Each of them is turned into a single space, so elided forms such as
# "l'" or "dell'" are split from the following word.
apostrophe_like <- paste0(
  "[",
  "\u0092", "\u2019", "\u00b4", "\u00d5", "\u017e",
  "\u017d", "\u02c6", "\u00ca", "\u02dc", "\u00c7",
  "'",
  "]"
)
myText$text <- gsub(apostrophe_like, " ", myText$text)

# Replace the word "no"/"No" wrapped in two special characters with a plain
# lower-case "no".
# NOTE(review): the wrapping characters appear as the Unicode replacement
# character in this copy of the script; the original quote characters were
# presumably lost in an encoding conversion, which would also explain why the
# first two patterns now look identical. Confirm against the original file
# before editing these patterns.
myText$text <- gsub("�no�","no",myText$text)
myText$text <-  gsub("�no�","no",myText$text)
myText$text <-  gsub("�No�","no",myText$text)

# Load the metadata table; its first row holds the column names.
meta.pr <- read.csv("tabella.csv", header = TRUE)
# Name is the merge key and must be plain character, not a factor.
meta.pr$Name <- as.character(meta.pr$Name)
# Discard the second column, then label what is now the fourth column.
meta.pr <- meta.pr[, -2]
names(meta.pr)[4] <- "Mission_name"

# Attach the metadata to every speech. merge() returns its rows sorted by the
# key and keeps only the names found in both tables, so the row order (and
# possibly the row count) of `fit` need not match that of `meta.pr`.
fit <- merge(myText, meta.pr, by = "Name")
# The fifth column of the merged table is the first metadata column.
names(fit)[5] <- "LR"
# Remember the merged names before the key column is dropped: they are the
# only names guaranteed to be aligned row by row with the texts in `fit`.
doc_names <- fit$Name
fit <- fit[, -1]
fit$Party <- as.factor(fit$Party)
fit$Mission <- as.factor(fit$Mission)

# Build the corpus from the merged table (texts come from the `text` column,
# the remaining columns become document variables).
corpus <- corpus(fit)
# Label the documents with the names taken from the merged table. Using
# meta.pr$Name here would attach names in CSV order, which mislabels documents
# whenever the CSV is not sorted by Name, and fails when it has unmatched rows.
docnames(corpus) <- doc_names

# Terms excluded from the document-feature matrix: the Italian stopword list,
# short fragments (presumably elided articles/prepositions left behind once
# apostrophes were replaced by spaces), and a set of country names.
# NOTE(review): "afganistan" is spelled without the "h" - confirm this matches
# the spelling used in the speeches.
dropped_terms <- c(
  stopwords("italian"),
  "l", "d", "dell", "dall",
  "afganistan", "libano", "kosovo", "iraq", "libia", "albania"
)

# Lower-case, drop punctuation and numbers, remove the terms above and stem.
myDfm <- dfm(
  corpus,
  tolower = TRUE,
  stem = TRUE,
  remove = dropped_terms,
  remove_punct = TRUE,
  remove_numbers = TRUE
)

########################### 
### define the word-seeds
########################### 

# Seed words for the three topics of interest. Entries ending in "*" are
# wildcard patterns; the others are matched as written.
# NOTE(review): "democrazia" is listed twice under humanitarian_dimension.
seed_words <- list(
  multilateralism = c(
    "multilateralism", "comunit", "responsabilit", "alleanza",
    "alleati", "impegno", "sicurezza", "coalizione"
  ),
  humanitarian_dimension = c(
    "democrazia", "umani", "democrazia", "democratica", "diritto",
    "pace", "solidariet", "libert", "pacific*", "umanitaria",
    "umanitari", "solidal*"
  ),
  war = c(
    "guerra", "militare", "bombardamenti", "militari", "costituzione",
    "disarmo", "chiarezza", "violenza", "bombe", "rischi", "vittime"
  )
)
myDict <- dictionary(seed_words)

##########################
### fitting the seeded LDA 
########################## 

# Fix R's random seed before fitting (presumably so the fit is reproducible).
set.seed(123)
# Fit the seeded LDA on the document-feature matrix using the seed-word
# dictionary defined above.
# NOTE(review): residual = TRUE is expected to add one extra, unseeded topic
# after the three seeded ones - confirm against the seededlda 0.2 docs.
slda <- textmodel_seededlda(myDfm, myDict, residual = TRUE)

# Extract the per-document weight of each seeded topic. The gamma slot is a
# documents x topics matrix; columns 1-3 are read as the three dictionary
# topics, in the order the dictionary defines them.
# Taking whole columns (instead of looping over a hard-coded 1:104) keeps the
# result correct for any number of documents and guarantees one value per
# fitted document. unname() yields plain numeric vectors, as the original
# element-by-element assignment did.
topic_weights <- slda$lda@gamma
multilateralism <- unname(topic_weights[, 1])
humanitarian_dimension <- unname(topic_weights[, 2])
war <- unname(topic_weights[, 3])

# Scores table: `fit` without its first column (presumably the raw speech
# text), plus one column per seeded topic.
fit_slda <- fit[-1]
fit_slda[["multilateralism"]] <- multilateralism
fit_slda[["humanitarian_dimension"]] <- humanitarian_dimension
fit_slda[["war"]] <- war

# Save the scores next to the script.
write.csv(fit_slda, file = "scores_slda.csv")
